Problem set 5

Author

Kai Mao

1 Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

2 Problem 1 - OOP Programming

2.1 a. For the rational class, define the following:

# Load the necessary libraries
library(Rcpp)
library(methods)

# Use Rcpp to define the GCD and LCM functions
cppFunction('
#include <numeric>
int gcd(int a, int b) {
  return std::gcd(a, b);
}')

cppFunction('
#include <numeric>
int lcm(int a, int b) {
  return std::lcm(a, b);
}
')


# Set the rational class
setClass(
  "rational",
  slots = list(numerator = "integer", denominator = "integer"),
  prototype = list(numerator = integer(0), denominator = integer(1))
)

#' rational number constructor
#' 
#' @description Creates an instance of the rational class.
#' @param numerator Integer, the numerator of the rational number.
#' @param denominator Integer, the denominator of the rational number. Must not be zero.
#' @return An object of class 'rational'.
#' @examples
#' r1 <- rational(1, 2)
#' r2 <- rational(3, 4)
rational <- function(numerator = 0, denominator = 1) {
  if (!is.numeric(numerator) || !is.numeric(denominator)) {
    stop("Both numerator and denominator must be numeric.")
  }
   if (numerator != as.integer(numerator) | denominator != as.integer(denominator)) {
    stop('Both numerator and denominator can not be floating point')
  }
  if (denominator == 0) {
    stop("The denominator can not be zero.")
  }
  new("rational", numerator = as.integer(numerator), denominator = as.integer(denominator))
}


# Validator to ensure the denominator is not zero
setValidity("rational", function(object) {
  if (object@denominator == 0L) return("The denominator cannot be zero")
  TRUE
})
Class "rational" [in ".GlobalEnv"]

Slots:
                              
Name:    numerator denominator
Class:     integer     integer
# Method to display the rational number
setMethod("show", "rational", function(object) {
  cat(object@numerator, "/", object@denominator, "\n")
})

# Generic and method to simplify and quotient rational numbers
setGeneric("simplify", function(object) standardGeneric("simplify"))
[1] "simplify"
setGeneric("quotient", function(object, digits = 0) standardGeneric("quotient"))
[1] "quotient"
#' Simplify a rational number
#' 
#' @description Reduces the rational number to its simplest form using the greatest common divisor.
#' @param object A 'rational' object.
#' @return A simplified 'rational' object.
setMethod("simplify", "rational", function(object) {
    g <- gcd(object@numerator, object@denominator)
    new_numerator <- object@numerator / g
    new_denominator <- object@denominator / g
    if (new_numerator < 0 & new_denominator < 0) {
    new_numerator <- abs(new_numerator)
    new_denominator <- abs(new_denominator)
  }
    return(rational(new_numerator, new_denominator))
})

#' Calculate the decimal quotient of a rational number
#' 
#' @description Computes the decimal value of the rational number and prints it formatted to a specified number of decimal places.
#' @param object A 'rational' object.
#' @param digits Numerical, number of decimal places for formatting the output.
#' @return The quotient as a numeric value, printed but not returned.
setMethod("quotient", "rational", function(object, digits = 0) {
  if (digits < 0 | !is.numeric(digits)) {
    stop('Digit Must be greater than or equal to 0.')
  }
  quotient <- object@numerator / object@denominator
  print(formatC(quotient, format = "f", digits = digits))
  return(invisible(quotient))
})

# Methods for arithmetic operations
setMethod('+', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  denom <- lcm(e1@denominator, e2@denominator)
  numer <- (e1@numerator * (denom / e1@denominator)) + 
    (e2@numerator * (denom / e2@denominator))
  return(simplify(rational(numer, denom)))
})

setMethod('-', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  denom <- lcm(e1@denominator, e2@denominator)
  numer <- (e1@numerator * (denom / e1@denominator)) - 
    (e2@numerator * (denom / e2@denominator))
  return(simplify(rational(numer, denom)))
})

setMethod('*', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  numer <- e1@numerator * e2@numerator
  denom <- e1@denominator * e2@denominator
  return(simplify(rational(numer, denom)))
})

setMethod('/', signature(e1 = 'rational', e2 = 'rational'), function(e1, e2) {
  numer <- e1@numerator * e2@denominator
  denom <- e1@denominator * e2@numerator
  if (denom == 0L) stop('division by zero')
  return(simplify(rational(numer, denom)))
})

2.2 b. Use your rational class to create three objects:

r1 <- rational(24, 6)
r2 <- rational(7, 230)
r3 <- rational(0, 4)
r1
24 / 6 
r3
0 / 4 
r1 + r2
927 / 230 
r1 - r2
913 / 230 
r1 * r2
14 / 115 
r1 / r2
920 / 7 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in r2/r3: division by zero
quotient(r1)
[1] "4"
quotient(r2)
[1] "0"
quotient(r2, digits = 3)
[1] "0.030"
quotient(r2, digits = 3.14)
[1] "0.030"
quotient(r2, digits = 'avocado')
Error in quotient(r2, digits = "avocado"): Digit Must be greater than or equal to 0.
q2 <- quotient(r2, digits = 3)
[1] "0.030"
q2
[1] 0.03043478
quotient(r3)
[1] "0"
simplify(r1)
4 / 1 
simplify(r2)
7 / 230 
simplify(r3)
0 / 1 

2.3 c. Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.

# Testing the constructor with valid and invalid inputs
tryCatch({
  r1 <- rational(1, 2)  # Valid input
  show(r1)
}, error = function(e) {
  print(e)
})
1 / 2 
tryCatch({
  r2 <- rational(1, 0)  # Invalid input, denominator is zero
  show(r2)
}, error = function(e) {
  print(e)
})
<simpleError in rational(1, 0): The denominator can not be zero.>
tryCatch({
  r3 <- rational(1.5, 2.5)  # Invalid input, non-integer values
  show(r3)
}, error = function(e) {
  print(e)
})
<simpleError in rational(1.5, 2.5): Both numerator and denominator can not be floating point>
tryCatch({
  r4 <- rational("one", "two")  # Invalid input, non-numeric values
  show(r4)
}, error = function(e) {
  print(e)
})
<simpleError in rational("one", "two"): Both numerator and denominator must be numeric.>

3 2. Problem 2 - plotly

3.1 a. Does the distribution of genre of sales across years appear to change?

3.1.1 Code

# Load the libraries
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Load the data
art <- read.csv("C:\\Users\\Feixing\\Desktop\\stats 506\\data\\Problem Set 4\\df_for_ml_improved_new_market.csv")

# Correct overlapping data
art$Genre___Others[art$Genre___Painting == 1] <- 0

# Create a single categorical variable
art$genre <- ifelse(art$Genre___Painting == 1, "Painting",
                    ifelse(art$Genre___Photography == 1, "Photography",
                           ifelse(art$Genre___Print == 1, "Print",
                                  ifelse(art$Genre___Sculpture == 1, "Sculpture",
                                         ifelse(art$Genre___Others == 1, "Other", NA)))))

# Data grouping and scale calculation
art_genre_year <- art %>%
  group_by(year, genre) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(year) %>%
  mutate(total = sum(count)) %>%
  ungroup() %>%
  mutate(frequency = count / total)

# Plot the stack area
p <- plot_ly(data = art_genre_year, x = ~year, y = ~frequency, type = 'scatter', mode = 'lines',
             stackgroup = 'one', color = ~genre) %>%
  layout(title = "Distribution of Art Sales Genre Across Years",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Proportion", tickformat = ",.0%"))

# Output the plot
p

3.1.2 Conclusion:

Painting: Initially, painting occupied a significant portion of the market in the early 2000s, but its share declined over time, particularly after 2005. This may indicate the rise of other art genres or changes in consumer preferences.

Photography: The proportion of photography was relatively low before 2000, but began to increase steadily starting in 2000, especially from 2005 to 2010. This significant increase may reflect the widespread adoption of digital photography technology and growing market interest in photographic art.

Sculpture: The proportion of sculpture sales remained relatively stable throughout the period, indicating a balanced market demand.

Print: The proportion of prints has risen since the 2000s, particularly after 2010, which may be associated with the growing popularity of prints as collectibles and decorative items.

Other: This category maintained a small proportion throughout the period, indicating it may include niche or specialized art genres.

3.2 b. Generate an interactive plot with plotly that can address both of these questions

3.2.1 Code

library(plotly)
library(dplyr)

# Load the data
art_sales <- read.csv("C:\\Users\\Feixing\\Desktop\\stats 506\\data\\Problem Set 4\\df_for_ml_improved_new_market.csv")

art_sales$Genre___Others[art_sales$Genre___Painting == 1] <- 0
art_sales <- art_sales %>%
  mutate(genre = case_when(
    Genre___Painting == 1 ~ "Painting",
    Genre___Photography == 1 ~ "Photography",
    Genre___Print == 1 ~ "Print",
    Genre___Sculpture == 1 ~ "Sculpture",
    Genre___Others == 1 ~ "Other",
    TRUE ~ NA_character_
  ))

# Calculate the average price of each art type for each year
yearly_prices <- art_sales %>%
  group_by(year, genre) %>%
  summarise(mean_price = mean(price_usd, na.rm = TRUE), .groups = 'drop')

# Calculate the average price for all types for each year
yearly_avg_prices <- art_sales %>%
  group_by(year) %>%
  summarise(yearly_avg_price = mean(price_usd, na.rm = TRUE), .groups = 'drop')

# Create interactive charts
plot <- plot_ly() %>%
  add_trace(data = yearly_prices, x = ~year, y = ~mean_price, type = 'scatter', 
            mode = 'lines+markers', color = ~genre, 
            colors = RColorBrewer::brewer.pal(5, "Set1"), hoverinfo = 'text', 
            text = ~paste('Year:', year, '<br>Mean Price: $', round(mean_price, 2))) %>%
  add_trace(data = yearly_avg_prices, x = ~year, y = ~yearly_avg_price, type = 'scatter', 
            mode = 'lines', line = list(color = 'black', width = 2), hoverinfo = 'text',
            text = ~paste('Yearly Average: $', round(yearly_avg_price, 2)),
            name = 'Yearly Average') %>%
  layout(title = "Change in Sales Price Over Time by Art Genre",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Average Sales Price (USD)"),
         hovermode = 'closest')

# Output the result
plot

3.2.2 Conclusion:

Changes in Sales Prices Over Time:
Overall, the sales prices across various art genres show a trend of volatile growth. Particularly from 2000 to 2008, most genres saw significant price increases. Despite some subsequent declines, the general trend remained upward.

How Art Genres Influence Changes in Sales Prices:
Painting: Prices were relatively stable until 2005, then rose significantly, especially peaking in 2008 before declining thereafter.
Photography: Prices gradually increased starting in 2000, peaking between 2007 and 2008, and then slightly declining. The price fluctuations in photography may be related to advancements in technology and market acceptance.
Print: Post-2005, prices began to rise significantly, peaking in 2008 and then declining. The increased market demand for prints may have contributed to the rise in prices.
Sculpture: Exhibited minor price fluctuations with a generally stable trend, though there was an increase in 2008.
Other: Price fluctuations were minor and relatively stable, without significant long-term increases or decreases.

4 Problem 3 - data.table

4.1 a. Generate a table (which can just be a nicely printed tibble) reporting the mean and median departure delay per airport. Generate a second table (which again can be a nicely printed tibble) reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights.

4.1.1 Code

# Load the necessary libraries
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':

    between, first, last
library(nycflights13)

# Load the data.table
flights_dt <- as.data.table(flights)
airports_dt <- as.data.table(airports)

# Departure data
dep_delay_stats <- flights_dt[ , .(
  mean_dep_delay = mean(dep_delay, na.rm = TRUE),
  median_dep_delay = median(dep_delay, na.rm = TRUE),
  num_flights = .N
), by = origin][num_flights >= 10][order(-mean_dep_delay)]

# Convert the airport code to the airport name
dep_delay_stats[, origin := airports_dt[.SD, on = .(faa = origin), x.name]]

# Arrival data
arr_delay_stats <- flights_dt[ , .(
  mean_arr_delay = mean(arr_delay, na.rm = TRUE),
  median_arr_delay = median(arr_delay, na.rm = TRUE),
  num_flights = .N
), by = dest][num_flights >= 10][order(-mean_arr_delay)]

# Convert the airport code to the airport name
arr_delay_stats[, dest := airports_dt[.SD, on = .(faa = dest), x.name]]

# Print results
print(dep_delay_stats[, .(origin, mean_dep_delay, median_dep_delay)])
                origin mean_dep_delay median_dep_delay
                <char>          <num>            <num>
1: Newark Liberty Intl       15.10795               -1
2: John F Kennedy Intl       12.11216               -1
3:          La Guardia       10.34688               -3
print(arr_delay_stats[, .(dest, mean_arr_delay, median_arr_delay)])
                          dest mean_arr_delay median_arr_delay
                        <char>          <num>            <num>
  1:     Columbia Metropolitan      41.764151             28.0
  2:                Tulsa Intl      33.659864             14.0
  3:         Will Rogers World      30.619048             16.0
  4:      Jackson Hole Airport      28.095238             15.0
  5:             Mc Ghee Tyson      24.069204              2.0
 ---                                                          
 98:       Seattle Tacoma Intl      -1.099099            -11.0
 99:             Honolulu Intl      -1.365193             -7.0
100:                      <NA>      -3.835907             -9.0
101: John Wayne Arpt Orange Co      -7.868227            -11.0
102:         Palm Springs Intl     -12.722222            -13.5

4.2 b. How many flights did the aircraft model with the fastest average speed take?

# Load the libraries
library(data.table)
library(nycflights13)

# Load the data.table
flights_dt <- data.table(flights)
planes_dt <- data.table(planes)

# Calculate the speed
speed_flights <- flights_dt[, .(distance, air_time), by = .(tailnum)]
speed_flights[, speed := distance / air_time * 60]

model_speeds <- merge(speed_flights, planes_dt[, .(tailnum, model)], by = "tailnum")
avg_speed_flights <- model_speeds[, .(
  average_speed = mean(speed, na.rm = TRUE),
  num_flights = .N
), by = model]

# Find the fastest average speed model
fastest_model <- avg_speed_flights[which.max(average_speed)]
fastest_model_df <- data.table(model = fastest_model$model, average_speed = fastest_model$average_speed, num_flights = fastest_model$num_flights)

# Output the data
print(fastest_model_df)
     model average_speed num_flights
    <char>         <num>       <int>
1: 777-222      482.6254           4

5 References and Data Sources

The datasets used in this project are obtained from the following sources:

  • nycflights13 Data: Provides airline on-time data for all flights departing NYC in 2013. This comprehensive dataset includes metadata on airlines, airports, weather, and planes. Available from the Comprehensive R Archive Network (CRAN) at https://CRAN.R-project.org/package=nycflights13. Maintained by Hadley Wickham.

  • df_for_ml_improved_new_market: This dataset enumerates characteristics of art sales, including the sale prices and details about the artworks sold. It is intended for creating publication-ready plots that elucidate market trends and valuations in art sales. More information and dataset download are available at Repository Link.

6 Methods and Implementation

All data analyses were performed in the R environment, employing a range of techniques including data import, data cleaning, statistical analysis, and result visualization.

7 Code and Documentation Repository

This document and related code are hosted on GitHub for review and sharing purposes. Access link: GitHub Repository Link

8 Acknowledgements

Thanks to the course instructors and teaching assistants for their guidance on this assignment. Thanks to all data providers for supporting open data.

9 Notes

The analyses in this document are for academic purposes only, intended to fulfill the requirements of a statistics course. While every reasonable effort has been made to ensure the accuracy of the analysis results, the content of this document represents only the views of the author.